In [2]:
import pandas as pd
import numpy as np
import json

from bokeh.resources import INLINE
from ipywidgets import interact
from bokeh.io import push_notebook  ,output_notebook , show , output_file
from bokeh.plotting import figure
from bokeh.models.widgets import Select 
from bokeh.models import HoverTool, ColumnDataSource , CheckboxGroup , CustomJS
from bokeh.layouts import widgetbox , row , layout

import seaborn as sns
%matplotlib inline
sns.set(style="white", palette="muted", color_codes=True)

store = pd.HDFStore('../../data/processed/orbf_benin.h5')
data = store['data']
store.close()

colormap = json.load(open('../../references/departements_colors.json'))

In [3]:
len(data)

a = data.groupby(['filetype_name' , 'entity_id' , 'period']).apply(len)

In [4]:
len(a['Facture Mensuelle PMA'])


Out[4]:
16131

High Level Data description

Full downloaded data has 564549 indicator/facility/months data points. We limit this data set to only data points with a validation, which reduces the data set to 434292 indicator/facility/months, representing 77% of the complete data.


In [5]:
data.filetype_name.value_counts()


Out[5]:
Facture Mensuelle PMA    406503
Name: filetype_name, dtype: int64

In [6]:
def make_timeline_data(data) :
    out = len(data)
    data['correction'] = (data.indicator_claimed_value == data.indicator_verified_value)
    
    perc_right = sum(data['correction']) / len(data)
    
    return pd.DataFrame([[out, perc_right]])

timeline_data = data.groupby(['entity_name' , 'entity_type' ,'period']).apply(make_timeline_data).reset_index()
timeline_data.columns = ['entity_name', 'entity_type' ,'date' , 'level_1' , 'value' , 'percent_right']

In [7]:
def make_fac_desc(data) :
    start_reporting = min(data.period)
    n_values = len(data)
    entity_type = data.entity_type.unique()[0]
    data['correction'] = (data.indicator_claimed_value == data.indicator_verified_value)
    
    perc_right = sum(data['correction']) / len(data)
    
    return pd.DataFrame([[start_reporting ,n_values , entity_type , perc_right]])

fac_desc = data.groupby('entity_name').apply(make_fac_desc).reset_index()
fac_desc.columns = ['entity_name' , 'level_1' ,'date_start' , 'n_values' , 'entity_type' , 'percent_right']

In [8]:
result = fac_desc.sort_values(['entity_type' ,'date_start', 'n_values' , 'entity_name'], ascending=[ 0 , 0 , 1 , 0])

In [9]:
timeline_data.entity_name_cat = data.entity_name.astype('category').cat.set_categories(result['entity_name'], ordered=True)

In [10]:
colors = []
for i in range(len(timeline_data)) :
    colors.append(colormap[timeline_data.loc[i , 'entity_type']])

We first look at the complete data to see how frequently correction happens in records.

The color is the type of facility we are considering. The shade is the percentage of reported indicator values that have been validated without being corrected.


In [12]:
#output_notebook(INLINE)

output_file('fac_data_quality')

In [ ]:
print(np.median(timeline_data.percent_right))

print(np.average(timeline_data.percent_right))

print(np.average(timeline_data.percent_right[timeline_data.percent_right < 1]))

print(np.median(timeline_data.percent_right[timeline_data.percent_right < 1]))

In [14]:
ax = sns.distplot(timeline_data.percent_right[timeline_data.percent_right < 1] , color="b" , bins=12)
ax.set(xlabel='% of correct indicators')
sns.plt.xlim(0,1.1)

ax.figure.savefig("../../reports/figures/correct_dist.pdf", dpi=1200)


C:\Users\grlurton\AppData\Local\Continuum\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [13]:
source = ColumnDataSource(data=dict(
    xname=timeline_data.date.astype(str).tolist(),
    yname=timeline_data.entity_name.tolist() ,
    colors=colors,
    alphas= timeline_data.percent_right.tolist() ,
    type_fac = timeline_data.entity_type ,
))


TOOLS = "hover,save,pan,box_zoom,wheel_zoom,reset"

p = figure(title="Monthly Reporting by facility",
           x_axis_location="above", tools=TOOLS, 
           x_range=sorted(list(set(timeline_data.date.astype(str).tolist()))), 
           y_range=result['entity_name'].astype(str).tolist())

p.plot_width = 800
p.plot_height = 1200
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "7pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/3

p.rect('xname', 'yname', 0.9, 0.9,  source=source,
       color='colors', alpha='alphas', line_color=None,
       hover_line_color='black', hover_color='colors')
p.select_one(HoverTool).tooltips = [
    ('Fosa', '@yname') ,
    ('Period' , '@xname') ,
    ('% Right' , '@alphas')
]


show(p)

We can see data has been entered since March 2012 in a group of facilities, and is entered regularly until november 2016. A second group of facilities start reporting in July 2015 and appear to stop reporting one year later. Some facilites start reporting around April 2014 and have been reporting since then.

We see facility specific patterns, with some facilities being consistently less corrected than others. This can be added to a period effect, withy apparently less correction at the end of periods. This could be interrogated.

Amount Spent


In [16]:
zone_color_dico = json.load(open('../../references/departements_colors.json'))
data['parent_geozone_name_color'] = ""
data['indicator_color'] = ""
for i in sorted(data.parent_geozone_name.unique().tolist()) :
    data.loc[data.parent_geozone_name == i , 'parent_geozone_name_color'] = zone_color_dico[i]

In [17]:
def received_amounts(data):
    return data.indicator_montant.sum(skipna = True)

amount_fac = data.groupby(['entity_id' , 'date' , 'entity_type' , 'entity_name' , 'parent_geozone_name' , 'parent_geozone_name_color']).apply(received_amounts)

amount_fac = amount_fac[amount_fac < 50000000]

amount_fac = amount_fac.reset_index()
amount_fac.columns = ['entity_id' , 'date' , 'entity_type' , 'entity_name' ,'parent_geozone_name' , 'parent_geozone_name_color' , 'value']

In [18]:
ax = sns.distplot(amount_fac.value , color="b" , bins=50)
ax.set(xlabel='% of correct indicators')
sns.plt.xlim(0,)

ax.figure.savefig("../../reports/figures/payment_distribution.pdf", dpi=1200)


C:\Users\grlurton\AppData\Local\Continuum\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [19]:
gammas = sns.load_dataset("gammas")

In [24]:
gammas.head()


Out[24]:
timepoint ROI subject BOLD signal
0 0.0 IPS 0 0.513433
1 0.0 IPS 1 -0.414368
2 0.0 IPS 2 0.214695
3 0.0 IPS 3 0.814809
4 0.0 IPS 4 -0.894992

In [22]:
amount_fac.head()


Out[22]:
entity_id date entity_type entity_name parent_geozone_name parent_geozone_name_color value
0 2 2012-03-01 Maternité isolée Csa Kpovie Mi ATLANTIQUE #1f78b4 136500.0
1 2 2012-04-01 Maternité isolée Csa Kpovie Mi ATLANTIQUE #1f78b4 136350.0
2 2 2012-05-01 Maternité isolée Csa Kpovie Mi ATLANTIQUE #1f78b4 189450.0
3 2 2012-06-01 Maternité isolée Csa Kpovie Mi ATLANTIQUE #1f78b4 85650.0
4 2 2012-07-01 Maternité isolée Csa Kpovie Mi ATLANTIQUE #1f78b4 161100.0

In [21]:
ax = sns.tsplot(data = amount_fac , time = 'date' , value = 'value'  , color="b" )
ax.set(xlabel='% of correct indicators')
#sns.plt.xlim(0,)

#ax.figure.savefig("../../reports/figures/payment_distribution.pdf", dpi=1200)


Out[21]:
[<matplotlib.text.Text at 0xc24f845a58>]

In [12]:
def make_multiline_data(data):
    xs = [data.date.tolist()]
    ys = [data.value.tolist()]
    color = [data.parent_geozone_name_color.tolist()]
    return pd.DataFrame([[xs[0] , ys[0]]])

amount_fac = amount_fac.groupby(['entity_id' ,'entity_type' , 'entity_name' , 'parent_geozone_name' , 'parent_geozone_name_color']).apply(make_multiline_data)
amount_fac = amount_fac.reset_index()
amount_fac.columns = ['entity_id' ,'entity_type' , 'entity_name' , 'parent_geozone_name' , 'parent_geozone_name_color', 'level' , 'xs' , 'ys']

In [13]:
dept = ['All'] + sorted(data.parent_geozone_name.unique())
indics = ['All'] + sorted(data.indicator_label.unique())

In [14]:
output_notebook(INLINE)


Loading BokehJS ...

In [15]:
source = ColumnDataSource(data=dict(xs = [] , ys = [] , color = [], facility_name = [] , legend = []))

TOOLS = "hover,save,pan,box_zoom,wheel_zoom,reset"
p = figure(x_axis_type="datetime",  plot_height=600, plot_width=700, title="Evolution of amounts paid" , tools=TOOLS)
p.grid.grid_line_alpha=0.3
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'Amount Paid'
p.multi_line('xs' , 'ys',  source = source , color='color' , alpha=0.4)
p.select_one(HoverTool).tooltips = [("Facility", "@facility_name") ,
                                    ("Département" , "@legend")
                                   ]

def update(Zones = "All"):
    df = amount_fac
    if (Zones != "All"):
        df = df[df.parent_geozone_name.str.contains(Zones)==True]
    source.data = dict(
        color = df['parent_geozone_name_color'].tolist() ,
        xs=df['xs'].tolist(),
        ys=df['ys'].tolist(),
        facility_name = df['entity_name'].tolist(),
        legend = df['parent_geozone_name'].tolist(),
    )
    push_notebook()

show(p, notebook_handle=True)
update()



In [16]:
interact(update, Zones=dept)


Out[16]:
<function __main__.update>

There are large variations in the amounts of money paid to different facilities for RBF. Variations happen within facilities as well as between different facilites. Amplitudes of variations do not appear to have evidently defined patterns.

Comparing errors


In [17]:
verif_data = data
verif_data['difference_indicateur'] = verif_data.indicator_verified_value - verif_data.indicator_claimed_value
verif_data['perc_variation_indicator'] = verif_data['difference_indicateur'] / verif_data['indicator_claimed_value']

In [18]:
verif_data = verif_data[(verif_data.indicator_claimed_value < 5000.0) &
                        (verif_data.perc_variation_indicator < 20) 
                       ]

In [19]:
output_notebook(INLINE)


Loading BokehJS ...

In [20]:
source = ColumnDataSource(data=dict(xs = [] , ys = [] , facility_name = [] , departement = [] , indicator = []))

TOOLS = "hover,save,pan,box_zoom,wheel_zoom,reset"
p = figure(plot_height=600, plot_width=700, title="Evolution of amounts paid" , tools=TOOLS)
p.grid.grid_line_alpha=0.3
p.xaxis.axis_label = 'Reported Amount'
p.yaxis.axis_label = '% Variation'
p.circle('xs' , 'ys',  source = source , alpha=0.4)
p.select_one(HoverTool).tooltips = [("Facility", "@facility_name") ,
                                    ("Département" , "@departement") ,
                                    ('Indicator' , "@indicator")
                                   ]

def update(departement = "All" , Indicateurs = 'All'):
    df = verif_data
    if (departement != "All"):
        df = df[df.parent_geozone_name.str.contains(departement)==True]
        indics = ['All'] + sorted(df.indicator_label.unique())
    if (Indicateurs != "All"):
        df = df[df.indicator_label == Indicateurs]
    source.data = dict(
        xs=df['indicator_claimed_value'].tolist(),
        ys=df['perc_variation_indicator'].tolist(),
        facility_name = df['entity_name'].tolist(),
        departement = df['parent_geozone_name'].tolist(),
        indicator = df['indicator_label'].tolist()
    )
    push_notebook()

show(p, notebook_handle=True)
update()



In [21]:
interact(update, departement=dept , Indicateurs = indics)


Out[21]:
<function __main__.update>

Amount Correction


In [22]:
def correct_amount(data) :
    data.indicator_claimed_value = sum(data.indicator_claimed_value)
    data.difference_indicateur = sum(data.indicator_verified_value - data.indicator_claimed_value)
    data.claimed_montant = sum(data.indicator_claimed_value * data.indicator_tarif)
    data.difference_montant = sum(data.indicator_montant - data.claimed_montant)
    return pd.DataFrame([[data.claimed_montant.tolist() , 
                          data.difference_montant.tolist() , 
                          data.difference_indicateur.tolist() ,
                        data.indicator_claimed_value.tolist()[0]]])

verif_data_report = data.groupby(['entity_id' , 'entity_name' , 'parent_geozone_name' ,'date']).apply(correct_amount)

verif_data_report = verif_data_report.reset_index()

verif_data_report.columns = ['entity_id' , 'entity_name' , 'parent_geozone_name' , 'date' , 'level_5', 
                      'claimed_montant' , 'difference_montant' , 'difference_indicateur' , 'claimed_indicateur']

verif_data_report['perc_variation_montant'] = verif_data_report['difference_montant'] / verif_data_report['claimed_montant']


#verif_data_report = verif_data_report[(np.abs(verif_data_report.difference_montant) < 500000000)]
print(np.median(verif_data_report.difference_montant))


-977555285.0

In [119]:
verif_data_report_plot = verif_data_report[(np.abs(verif_data_report.difference_montant) <10000000000)]
ax = sns.distplot(verif_data_report_plot.difference_montant , color="b")
ax.set(xlabel='Correction of payment after data validation')
ax.figure.savefig("../../reports/figures/payment_correction.png", dpi=1200)


C:\Users\grlurton\AppData\Local\Continuum\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [118]:
np.median(verif_data_report.difference_montant[(np.abs(verif_data_report.difference_montant) <10000000000)])


Out[118]:
-970017945.0

In [ ]:
ax = sns.distplot(timeline_data.percent_right[timeline_data.percent_right < 1] , color="b" , bins=12)
ax.set(xlabel='% of correct indicators')
sns.plt.xlim(0,1.1)

ax.figure.savefig("../../reports/figures/correct_dist.pdf", dpi=1200)

In [131]:
import matplotlib.pyplot as plt
sns.despine(left=True)

verif_data_report_plot2 = verif_data_report_plot[verif_data_report_plot.entity_name.isin(['Csa Dekanme Cs' , 'Tohoue Csc' , 'Bembe Di' , 'Zounta Mi'])]

g = sns.FacetGrid(verif_data_report_plot2, col="entity_name", col_wrap=2, size=5 , sharex = False , sharey = False )
g = g.map(sns.distplot, "difference_montant")

g.fig.savefig("../../reports/figures/facility_correct.pdf", dpi=1000)


C:\Users\grlurton\AppData\Local\Continuum\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
<matplotlib.figure.Figure at 0x413f0b6d8>

In [ ]: